library(plyr)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.2 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.3 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::arrange() masks plyr::arrange()
## ✖ purrr::compact() masks plyr::compact()
## ✖ dplyr::count() masks plyr::count()
## ✖ dplyr::desc() masks plyr::desc()
## ✖ dplyr::failwith() masks plyr::failwith()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::id() masks plyr::id()
## ✖ dplyr::lag() masks stats::lag()
## ✖ dplyr::mutate() masks plyr::mutate()
## ✖ dplyr::rename() masks plyr::rename()
## ✖ dplyr::summarise() masks plyr::summarise()
## ✖ dplyr::summarize() masks plyr::summarize()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(here)
## here() starts at C:/Users/avery/Documents/ds241/ds241_f23/basta-dataforskare/basta-dataforskare
##
## Attaching package: 'here'
##
## The following object is masked from 'package:plyr':
##
## here
library(geojsonR)
## Warning: package 'geojsonR' was built under R version 4.3.2
library(janitor)
##
## Attaching package: 'janitor'
##
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
library(knitr)
library(lubridate)
library(mapview)
## Warning: package 'mapview' was built under R version 4.3.2
library(gbfs)
## Warning: package 'gbfs' was built under R version 4.3.2
library(sf)
## Linking to GEOS 3.11.2, GDAL 3.6.2, PROJ 9.2.0; sf_use_s2() is TRUE
library(tmap)
## Warning: package 'tmap' was built under R version 4.3.2
## Breaking News: tmap 3.x is retiring. Please test v4, e.g. with
## remotes::install_github('r-tmap/tmap')
library(tidycensus)
## Warning: package 'tidycensus' was built under R version 4.3.2
library(dplyr)
library(conflicted)
library(plotly)
## Warning: package 'plotly' was built under R version 4.3.2
conflicts_prefer(here::here)
## [conflicted] Will prefer here::here over any other package.
conflicts_prefer(dplyr::rename)
## [conflicted] Will prefer dplyr::rename over any other package.
conflicts_prefer(dplyr::filter)
## [conflicted] Will prefer dplyr::filter over any other package.
conflicts_prefer(dplyr::mutate)
## [conflicted] Will prefer dplyr::mutate over any other package.
Metro Station Entrances to map the location of metro, boarding data to show how many people are using the metro station, and bikeshare to show the number of people riding bikes.
All data is from the month September because there are no major holidays, the weather is still decent enough for people to ride bikes, and the number of tourists/ pleasure bike riders are reduced.
For the purpose of this project, we plan on focusing on the commuters, and plan on creating more bike locations to better suit the number of commuters.
metro <- FROM_GeoJson(here('data_raw', 'Metro_Station_Entrances_in_DC.geojson'))
metroRiders <- read.csv(here( 'Boardings by Route Table_Full Data_data.csv'))
metroLoc <- read.csv(here('data_raw', 'Metro_Stations_Regional.csv'))
sept_raw <- read_csv(here( '202309-capitalbikeshare-tripdata.csv'))
## Rows: 450090 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): ride_id, rideable_type, start_station_name, end_station_name, memb...
## dbl (6): start_station_id, end_station_id, start_lat, start_lng, end_lat, e...
## dttm (2): started_at, ended_at
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
neigh = st_read(here("data_raw", "DC_Health_Planning_Neighborhoods.geojson")) %>% clean_names()
## Reading layer `DC_Health_Planning_Neighborhoods' from data source
## `C:\Users\avery\Documents\ds241\ds241_f23\basta-dataforskare\basta-dataforskare\data_raw\DC_Health_Planning_Neighborhoods.geojson'
## using driver `GeoJSON'
## Simple feature collection with 51 features and 8 fields
## Geometry type: POLYGON
## Dimension: XY
## Bounding box: xmin: -77.11976 ymin: 38.79165 xmax: -76.9094 ymax: 38.99556
## Geodetic CRS: WGS 84
This filters the data so we are only getting entries for the weekdays and not the weekends, appending location variables to station names, and combining repeat stations with a summed amount of entries.
#metroLoc = metroLoc |>
#rename("X" = "ï..X")
metroAddy <- subset(metroLoc, select = c(NAME, ADDRESS, X, Y))|>
rename("Station" = "NAME", "Lon" = "X", "Lat" = "Y")
metroRiders$Time.Period = NULL
metroRiders$Day.of.Week = NULL
metroRiders$Holiday = NULL
metroRiders$Month = NULL
metroRiders$Year = NULL
metroRiders$Avg.Daily.Entries.Rounded = NULL
#metroRiders = metroRiders |>
#rename("Station" = "ï..Station")
metroR1 <- metroRiders |>
filter(Servicetype == "Weekday") |>
ddply("Station", numcolwise(sum))
METRO <- merge(x = metroR1, y = metroAddy, by = "Station")
glimpse(METRO)
## Rows: 86
## Columns: 5
## $ Station <chr> "Anacostia", "Arlington Cemetery", "Ashburn", "Ballston-MU", "…
## $ Entries <int> 62365, 16111, 25199, 114875, 31518, 102933, 45638, 47644, 8514…
## $ ADDRESS <chr> "1101 HOWARD ROAD SE, WASHINGTON, DC", "1000 NORTH MEMORIAL DR…
## $ Lon <dbl> -76.99537, -77.06281, -77.49154, -77.11317, -76.93837, -77.094…
## $ Lat <dbl> 38.86297, 38.88469, 39.00529, 38.88219, 38.89098, 38.98440, 38…
bikeR1 is the data set originated from September Bikeshare data. It is filtered to keep the columns “started at”, “start lat” and “start_lng”. Na.omit gets rid of everything null, and mutate adds the date to when each bike ride started.
bikeR2 is a further filtering of bikeR1 where coordinates are added so we can map out the bike riders starting location.
bikeR3 is the new data set where bikeR2 and neigh are joined.
bikeR1 = sept_raw %>% select(started_at, start_lat, start_lng) %>% na.omit() %>% mutate(start_date=as.Date(started_at)) %>% select(start_date, start_lat, start_lng)
bikeR2 = bikeR1 %>% st_as_sf(coords=c("start_lng", "start_lat"), crs=4326)
st_crs(neigh$geometry[1])
## Coordinate Reference System:
## User input: WGS 84
## wkt:
## GEOGCRS["WGS 84",
## DATUM["World Geodetic System 1984",
## ELLIPSOID["WGS 84",6378137,298.257223563,
## LENGTHUNIT["metre",1]]],
## PRIMEM["Greenwich",0,
## ANGLEUNIT["degree",0.0174532925199433]],
## CS[ellipsoidal,2],
## AXIS["geodetic latitude (Lat)",north,
## ORDER[1],
## ANGLEUNIT["degree",0.0174532925199433]],
## AXIS["geodetic longitude (Lon)",east,
## ORDER[2],
## ANGLEUNIT["degree",0.0174532925199433]],
## ID["EPSG",4326]]
bikeR3 = bikeR2 %>% st_join(neigh)
#code for possible future mapping
#df1_s_sf = df1_s %>% st_as_sf(coords =c("start_lng", "start_lat"), crs = 4326)
The first part of this code chunk is converting the metro data frame into a spatial data frame.
MetroMap2 is a filtration of MetroMap that joins the data set “neigh” and omits any null values. Then a variable ‘code’ is added to the numcolwise. There are 50 ‘codes’ created in this process. Then from those codes, we will determine rideship for both bikes and metro.
MetroMap <- st_as_sf(METRO, coords = c("Lon", "Lat"), crs =4326)
MetroMap2 <- MetroMap %>%
st_join(neigh) %>% na.omit() %>%
ddply("code", numcolwise(sum))
neigh1 is the new data frame of “neigh” where code and geometry are the chosen variables to be kept.
bike R4 is a further filtration of bikeR3, where start date, code, geometry is kept and geometry column is dropped.
bikeR5 is another filter of neigh1, where bikeR4 is added (joined). Additionally, each of the weekend dates are removed from the data set as we chose to only look at weekday data.
neigh1 = neigh %>% select(code, geometry)
bikeR4 = bikeR3 %>% select(start_date, code, geometry) %>% st_drop_geometry()
bikeR5 = neigh1 %>% full_join(bikeR4) %>% filter(start_date != as.Date('2023-09-02')) %>% filter(start_date != as.Date('2023-09-03')) %>% filter(start_date != as.Date('2023-09-09')) %>% filter(start_date != as.Date('2023-09-10')) %>% filter(start_date != as.Date('2023-09-16')) %>% filter(start_date != as.Date('2023-09-17')) %>% filter(start_date != as.Date('2023-09-23')) %>% filter(start_date != as.Date('2023-09-24')) %>% filter(start_date != as.Date('2023-09-30'))
## Joining with `by = join_by(code)`
bikeR6 is a nre data frame where we took the bike data from set bikeR5. bikeR6 has 51 codes and they are listed as observations. All null values are ommitted.
bikeR7 takes the data from bikeR6 and keeps the code as well as frequency and renames it to bike_freq.
#plot(neigh)
bikeR6 = data.frame(table(bikeR5$code)) %>% rename(code=Var1) %>% full_join(bikeR5) %>% select(code, Freq, geometry) %>% distinct() %>% na.omit()
## Joining with `by = join_by(code)`
bikeR7 = bikeR6 %>% select (code, Freq) %>% rename(bike_freq = Freq)
MetroMap3 = MetroMap2 %>% select(Entries, code) %>% rename(metro_freq = Entries)
metro_bike_df = bikeR7 %>% full_join(MetroMap3) %>% mutate(metro_freq = replace_na(metro_freq, 0))
## Joining with `by = join_by(code)`
#bikeR7 = bikeR5 %>% count(code, start_date)
#plot(bikeR6)
bikeR8 takes bikeR6 and keeps the code and frequency. It also creates a column called bike because all the data in this set is from bike riders. We will use this column later when we make our visual.
MetroMap4 continues the filtration of MetroMap2 where entries (later renamed to freq) and code are kept. Every data in this set is given the variable ‘metro’ as they represent a metro rider.
bikeR8 = bikeR6 %>% select (code, Freq) %>% rename(freq = Freq) %>% mutate(transport = 'bike')
MetroMap4 = MetroMap2 %>% select(Entries, code) %>% rename(freq = Entries) %>% mutate(transport = 'metro')
code = c("N1", "N10", "N11", "N14", "N15", "N16", "N2", "N20", "N21", "N22", "N26", "N27", "N28", "N3", "N32", "N33", "N34", "N36", "N37", "N4", "N40", "N41", "N45", "N46", "N47", "N49", "N5", "N50", "N51", "N6", "N8")
freq = c(0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)
transport = c('metro', 'metro', 'metro', 'metro', 'metro', 'metro', 'metro', 'metro', 'metro', 'metro', 'metro', 'metro', 'metro', 'metro', 'metro', 'metro', 'metro', 'metro', 'metro', 'metro', 'metro', 'metro', 'metro', 'metro', 'metro', 'metro', 'metro', 'metro', 'metro', 'metro', 'metro')
metroExtra = data.frame(code, freq, transport)
MetroMap4 = MetroMap4 %>% rbind(metroExtra)
metro_bike_df2 = bikeR8 %>% full_join(MetroMap4)
## Joining with `by = join_by(code, freq, transport)`
This is a simple visual of the metro station locations in DC.
entrances=st_read(here("Metro_Station_Entrances_in_DC.geojson")) %>% clean_names()
## Reading layer `Metro_Station_Entrances_in_DC' from data source
## `C:\Users\avery\Documents\ds241\ds241_f23\basta-dataforskare\basta-dataforskare\Metro_Station_Entrances_in_DC.geojson'
## using driver `GeoJSON'
## Simple feature collection with 113 features and 23 fields
## Geometry type: POINT
## Dimension: XY
## Bounding box: xmin: -77.08577 ymin: 38.84465 xmax: -76.93472 ymax: 38.97578
## Geodetic CRS: WGS 84
class(entrances)
## [1] "sf" "data.frame"
plot(entrances)
## Warning: plotting the first 9 out of 23 attributes; use max.plot = 23 to plot
## all
We create a gg plot of the data from above. The combined data set of metro and bike riders (metro_bike_df2). We wanted to visualize the number of people who are riding the metro vs using bikes in each of the ‘codes’.
charts <- ggplot(metro_bike_df2, aes(fill=transport, y=freq, x=code)) + geom_bar(position='dodge', stat='identity')
ggplotly(charts)
Based on the data comparisons of metro entries and bike entries, we would recommend that the bikeshare group look into increasing the amount of bike stations in neighborhoods: n1, n2, n3, n4, n6, n8, n10, n11, n14, n15, n16, n20, n21, n22, n26, n27, n28, n32, n33, n34, n36, n37, n40, n41, n45, n46, n47, n49, n50, n51.